In this report, I will do an exploratory data analysis between primary outcome and key variables. I will describe and summarize any variables that I create, produce summary statistics and depict some associations between the outcomes and key variables.
dim(data)
## [1] 132538 154
Get the dataset with the combination of hospitalization data and survey data. There are total 132538 observations and 154 variables.
Mortality outcome and time window
# convert deceased outcome as factor##
data$deceased<- as.factor(data$deceased)
table(data$deceased)
##
## 0 1
## 64099 68439
# create a variable that indicate the time window between discharge and death
data<-
data %>%
mutate(time_discharge_death=as.numeric(as.Date(date_of_death_dt)-as.Date(discharge_dt)))
summary(data$time_discharge_death)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -4950 178 953 1409 2227 6739 64099
data<-
data %>%
mutate(time_discharge_death=ifelse(time_discharge_death<0,NA,time_discharge_death)) # negative time_window results set as NA, dead before discharge or maybe input mistake
# convert time window variable into categorical variable, total 6 time window categories
data<-
data%>%
mutate(time_window= case_when(
time_discharge_death<=30 ~ "1", # a month less
time_discharge_death<=180 ~ "2", # half year less
time_discharge_death<=365 ~ "3", # 1 year less
time_discharge_death<=1825 ~ "4", # 5 years less
time_discharge_death<3650 ~ "5", # 10 years less
time_discharge_death>=3650 ~ "6", # 10 years more
TRUE~NA_character_
))
data$time_window<- as.factor(data$time_window)
table(data$time_window)
##
## 1 2 3 4 5 6
## 8507 8537 5330 24430 14708 6758
P1<-
data%>% # plot Mortality distribution
filter(!(deceased%in% NA))%>%
ggplot(aes(x=deceased))+
geom_bar(fill="steelblue",width = 0.6)+
labs(x="Deceased",y="Population",title = "Decesed Distribution")+
scale_x_discrete(labels=c("0"= "No", "1"="Yes"))+
theme_minimal()
P2<-
data%>% # counts death within time windows
filter(!(time_window%in% NA))%>%
ggplot(aes(x=time_window))+
geom_bar(fill= "steelblue")+
labs(x="Time Window", y="Population",title="Death within a certain time window ")+
scale_x_discrete(labels=c("1"= "30 days less","2"= "180 days less","3"= "1 year less",
"4"= "5 years less","5"= "10 years less","6"= "10 years more"))+
theme_minimal()+
theme(axis.text.x = element_text(angle=45,hjust=1))
P3<-
data%>% # death within time windows
filter(!(time_discharge_death%in% NA))%>%
ggplot(aes(x=time_discharge_death))+
geom_density(fill= "steelblue")+
labs(x=" Days", y="Density",title=" Death density within days after discharge ")+
theme_minimal()
# subplot mortality and time window
grid.arrange(P1,P3,P2,nrow=2,ncol=2)
The mortality variable has a balanced distribution.51.6% people are deceased in this data set. Then I created a variable that indicates the days between people dischage and death. The mean value is 1409 days and the longest time is 6739 days. There are negative values which indicate that people dead before discharge.The minimum value is -4950 days which does not make sense.I believe it is an input mistake. These negative values are not valid to futher analysis, so I impute them as NA. Furthermore, I divided the days into 6 categories indicating death within different timw windows after people discharge.The 6 categories are a month less,half year less,1 year less,5 years less, 10 years less and 10 years more.
Primary exposure: Length of stay, co-morbidities record and admission types
# length of stay
summary(data$length_of_stay_day_cnt)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 2.000 3.000 4.713 5.000 1792.000
P6<-
data%>% # LOS with time window type
filter(!(time_window%in% NA))%>%
ggplot(aes(x=time_window,y=length_of_stay_day_cnt))+
labs(x=" time windows", y="Length of Stay", title = "Length of Stay in time windows")+
geom_point(color= "steelblue")+
scale_x_discrete(labels=c("1"= "30 days less","2"= "180 days less","3"= "1 year less","4"= "5 years less","5"= "10 years less","6"= "10 years more"))+
stat_summary(fun=median,geom="point",shape=18,size=3,color="black")+
theme_minimal()
P6
# admission type and death
P7<-
data%>%
ggplot(aes(x=factor(admission_typ),fill=deceased))+
geom_bar(position = position_dodge())+
scale_fill_brewer(palette = "Paired",labels=c("0"="No","1"="Yes"))+
labs(x="Adimission type",y="Population",title="Death Population by admission types")+
scale_x_discrete(labels=c("0"="none report","1"="Scheduled","2"="Unscheduled","3"="Infant","4"="Unknown"))+
theme_minimal()
table(data$admission_typ)
##
## 0 1 2 3 4
## 10 47474 84975 5 74
P7
The median value of length of stay in all participants is 3 days. Majority of participants stay at hospital less than 5 days. People who have a short term time window (less than 30 days) have high proportion of longer satying in hospital than people have long term time window (more than 10 years).The longest time of staying in hospital is 1792 days. It is reasonale that people who have severe disease staying hospital longer and dead earlier. In admission type variable, the proportion of scheduled and unscheduled admission type are more than 99%. The unscheduled admission type has a highest deceased rate.
######### death cause/diagnose/procedure ICD9 code###########
data$major_diag_cat_cde<-as.factor(data$major_diag_cat_cde)
# major diagnostic categories in death population
P8<-
data%>%
ggplot(aes(y=factor(major_diag_cat_cde),fill=deceased))+
geom_bar(position = position_dodge())+
scale_fill_brewer(palette = "Paired",labels=c("0"="No","1"="Yes"))+
labs(x="population",y=" ", title="Major Diagnosis Disease and Disorders in Participants")+
scale_y_discrete(labels=c("0"="Ungroupable","1"="Nervous","2"="Eye","3"="Ear",
"4"="Respiratory System","5"="Circulatory System","6"="Digestive System",
"7"="Hepatobiliary System & Pancreas","8"="Musculoskeletal System ",
"9"="Skin & Breast","10"="Endocrine & Metabolic","11"="Kidney and Urinary Tract",
"12"="Male Reproductive System", "13"="Female Reproductive System",
"14"="Pregnancy & Childbirth","15"="Newborns and Neonate Conditions",
"16"="Blood & Immunological","17"="Myeloproliferative ",
"18"="Infectious & Parasitic","19"="Mental","20"="Alcohol-Drug Use",
"21"="Injuries & Poisonings","22"="Burns","23"= "Factors on Health Status",
"24"=" Multiple Signficant Trauma", "25"="Immunodeficiency Virus Infection"))+
theme_minimal()
P8
The major diagnosis disease categories are divided into 25 diagosis grouping. The top three diagnosis diseases for deceased people are circulatory system disease and disorder, musculoskeletal system disease and disorder and respiratory system disease and disorder. The top three diagnosis diseases for survived people are musculoskeletal system disease and disorder, respiratory system disease and disorder and digestive system disease and disorde.
Covariates: age at death, race and month at death
# month variable
data<-
data%>%
mutate(month=months(data$date_of_death_dt))
data$month<- as.numeric(as.factor(data$month))
data$month<-as.factor(data$month)
table(data$month)
##
## 1 2 3 4 5 6 7 8 9 10 11 12
## 5638 5063 6004 5905 6479 5424 5405 6124 5434 5793 5795 5375
P5<-
data%>% # month or season time with death variable
filter(!(month%in% NA))%>%
filter(deceased==1)%>%
ggplot(aes(y= month))+
geom_bar(fill="steelblue")+
labs(x="Population", y=" ",title=" Death population by Month ")+
scale_y_discrete(labels=c("1"= "Jau","2"= "Feb","3"= "Mar","4"= "Apr","5"= "May","6"= "Jun","7"= "Jul",
"8"= "Aug","9"= "Sep","10"= "Oct","11"= "Nov","12"= "Dec"))+
theme_minimal()
P5
######## demographic predictors and other covatiates #############
#create a variable of age at death
data<-
data%>%
mutate(age_death=as.numeric(year(date_of_death_dt)-year(date_of_birth_dt)))
P9<-
data%>% # age distribution at death
filter(!(deceased%in% NA))%>%
filter(!(age_death%in% NA))%>%
ggplot(aes(x=age_death))+
geom_histogram(bins = 20,fill="steelblue")+
labs(x="Age at Death",y="Count",title="Age Distribution at Death")+
theme_minimal()
#race and death variable
data$participant_race<-as.factor(data$participant_race)# convert race as dummy variable
P10<-
data%>%
ggplot(aes(x=factor(participant_race),fill=deceased))+
geom_bar()+
scale_x_discrete(labels=c("0"="none reported","1"="White","2"="black","3"="hispanic","4"="native American",
"5"="Asian or Pacific Islander","6"="other"))+
labs(x="",y="Ppopulation",title="Death by Participant race")+
scale_fill_brewer(palette = "Paired",labels=c("0"="No","1"="Yes"))+
theme_minimal()+
theme(axis.text.x = element_text(angle=45,hjust=1))
summary(data$age_death)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 34.00 79.00 86.00 84.36 92.00 111.00 64099
table(data$participant_race)
##
## 0 1 2 3 4 5 6
## 1159 119349 3230 3368 1323 3016 1093
P9; P10
Month variable was created to indicate the death month between these deceased population. The month during years has a similar value. Age at death is also an important predictors for mortality. I created a variable of age at death to indicate the age distribution for those deceased people after discharge. The death year sustract the year of birth is the age at death. The median age of deceased people is 86 years old. There are 6 categories of race for participants.90% of these participants are white.The deceasecd rate in these white people is approximate 50%.
# normalize total charges
data$total_charges_amt<- as.numeric(round(data$total_charges_amt/median(data$total_charges_amt),2))
# drop predictors with more than 80% missing values
data[c("qnr_4_mini_fill_dt","qnr_5_fill_dt","qnr_5_mini_fill_dt","qnr_6_fill_dt","bilateral_mastectomy_dt",
"bilateral_oophorectomy_dt","first_moveout_ca_dt","endoca_self_q1","cervca_self_q1","ovryca_self_q1",
"lungca_self_q1","leuk_self_q1","hodg_self_q1","colnca_self_q1","thyrca_self_q1","meln_self_q1",
"diab_self_q1","hbp_self_q1","stroke_self_q1","hrtatk_self_q1","vit_reg_no","smoke_yrs_quit",
"src_site_cde","src_licensure_cde","src_route_cde","spoken_lang_cde","proc_icd3","proc_icd4","proc_icd5",
"proc_icd_dsc3","proc_icd_dsc4","proc_icd_dsc5","proc_ccs3","proc_ccs4","proc_ccs5","X")]<- list(NULL)
# drop the redundant and unnecessary predictors
data[c("breast_cancer_res_only_ind","participant_key","date_of_birth_dt",
"qnr_1_fill_dt","qnr_2_fill_dt","qnr_3_fill_dt","qnr_4_fill_dt","allex_hrs_q1","nih_ethnic_cat",
"stand_hrs","sit_hrs","vit_mulvit_q1","serv_fats_q1","alchl_g_dayrecen","smoke_lifeexpo",
"passmok_expocat","smoke_totyrs","cig_day_avg","smoke_totpackyrs","shsmok_any","visit_id","admission_dt",
"src_admission_cde","facility_county_cde","facility_zip5_cde","payer_plan_cde","age_mom_atbirth","age_dad_atbirth",
"oralcntr_yrs","fullterm_age1st","preg_total_q1","meno_stattype","brca_selfsurvey","smoke_expocat","near_oilrefine","season")]<- list(NULL)
# convert some variables to dummy variables
# convert some variables to numeric variables
## check other missing value and verify no NA data / impute the NA data?
## covariates : family hostory, phisical activity, diet, alcohol and tobacoo use.
######### death cause/diagnose/procedure ICD9 code ###########
To do list:
- Data preprocessing: tidy up the dataset before it could be used for the prediction model.
- drop all variables with more than 80% of missing values.
- drop all redandunt and unnecessary variables.
- check other missing values and identify no NA data.
- convert certain predictors into dummy variables.
- covariates: diet, smoke, tobacco, alcohol
- ICD-9 code
- Model development and select the best fit model
- Regularization Logistic Regression
- KNN classification
- Random Forest
- Gradient Booting